In [20]:
# core packages
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# machine learning utilities
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# data source
from yahoo_fin import stock_info
import yfinance as yf

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# plotting parameter tuning
sns.set_style('whitegrid')
sns.set_context('talk')
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (30, 10),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large'}

plt.rcParams.update(params)
In [2]:
df = yf.download("AAPL", start="2010-01-01", end="2022-05-12") # interval='1m'
[*********************100%***********************]  1 of 1 completed

Interval data

interval='1m': minute-level data is available only for the latest 7 days; interval='1h': hourly data is available for roughly the latest 730 days.
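
For example, a minimal sketch of intraday requests (assuming the same yf.download API used above; Yahoo rejects requests beyond these lookback windows):

minute_bars = yf.download("AAPL", period="7d", interval="1m")  # minute bars: only the last 7 days are served
hourly_bars = yf.download("AAPL", period="2y", interval="1h")  # hourly bars: roughly the last 730 days are served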

In [3]:
df.head(10)
Out[3]:
Open High Low Close Adj Close Volume
Date
2010-01-04 7.622500 7.660714 7.585000 7.643214 6.535086 493729600
2010-01-05 7.664286 7.699643 7.616071 7.656429 6.546383 601904800
2010-01-06 7.656429 7.686786 7.526786 7.534643 6.442255 552160000
2010-01-07 7.562500 7.571429 7.466071 7.520714 6.430346 477131200
2010-01-08 7.510714 7.571429 7.466429 7.570714 6.473098 447610800
2010-01-11 7.600000 7.607143 7.444643 7.503929 6.415995 462229600
2010-01-12 7.471071 7.491786 7.372143 7.418571 6.343011 594459600
2010-01-13 7.423929 7.533214 7.289286 7.523214 6.432484 605892000
2010-01-14 7.503929 7.516429 7.465000 7.479643 6.395228 432894000
2010-01-15 7.533214 7.557143 7.352500 7.354643 6.288351 594067600
In [4]:
df.shape
Out[4]:
(3111, 6)
In [5]:
df.describe()
Out[5]:
Open High Low Close Adj Close Volume
count 3111.000000 3111.000000 3111.000000 3111.000000 3111.000000 3.111000e+03
mean 46.292615 46.787252 45.800019 46.310659 44.512253 2.651923e+08
std 42.907589 43.434071 42.387189 42.927175 43.405764 2.246157e+08
min 6.870357 7.000000 6.794643 6.858929 5.864507 4.100000e+07
25% 18.668750 18.860893 18.513036 18.695536 16.401495 1.069804e+08
50% 28.822500 29.087500 28.502501 28.797501 26.446293 1.780110e+08
75% 51.071249 51.567499 50.698750 51.121250 49.796648 3.612910e+08
max 182.630005 182.940002 179.119995 182.009995 181.511703 1.880998e+09
In [6]:
df.isnull().sum()
Out[6]:
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
In [7]:
#feature engineering
# weekend flag (weekday 5 or 6); always 0 here because markets are closed on weekends
df['weekday'] = np.where(df.index.weekday >= 5, 1, 0)
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
# prices lagged by 1-4 trading days
df['close_price_lag1'] = df['Close'].shift(1)
df['close_price_lag2'] = df['Close'].shift(2)
df['close_price_lag3'] = df['Close'].shift(3)
df['close_price_lag4'] = df['Close'].shift(4)
df['open_price_lag1'] = df['Open'].shift(1)
df['open_price_lag2'] = df['Open'].shift(2)
df['open_price_lag3'] = df['Open'].shift(3)
df['open_price_lag4'] = df['Open'].shift(4)
df['high_price_lag1'] = df['High'].shift(1)
df['low_price_lag1'] = df['Low'].shift(1)
# day-over-day percentage changes
df['close_price_error_percent'] = df['Close'].pct_change()
df['open_price_error_percent'] = df['Open'].pct_change()
df['high_price_error_percent'] = df['High'].pct_change()
df['low_price_error_percent'] = df['Low'].pct_change()
df['adj_close_price_error_percent'] = df['Adj Close'].pct_change()
In [8]:
df.head()
Out[8]:
Open High Low Close Adj Close Volume weekday year month day ... open_price_lag2 open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent
Date
2010-01-04 7.622500 7.660714 7.585000 7.643214 6.535086 493729600 0 2010 1 4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2010-01-05 7.664286 7.699643 7.616071 7.656429 6.546383 601904800 0 2010 1 5 ... NaN NaN NaN 7.660714 7.585000 0.001729 0.005482 0.005082 0.004096 0.001729
2010-01-06 7.656429 7.686786 7.526786 7.534643 6.442255 552160000 0 2010 1 6 ... 7.622500 NaN NaN 7.699643 7.616071 -0.015906 -0.001025 -0.001670 -0.011723 -0.015906
2010-01-07 7.562500 7.571429 7.466071 7.520714 6.430346 477131200 0 2010 1 7 ... 7.664286 7.622500 NaN 7.686786 7.526786 -0.001849 -0.012268 -0.015007 -0.008066 -0.001849
2010-01-08 7.510714 7.571429 7.466429 7.570714 6.473098 447610800 0 2010 1 8 ... 7.656429 7.664286 7.6225 7.571429 7.466071 0.006648 -0.006848 0.000000 0.000048 0.006648

5 rows × 25 columns

In [9]:
df.isnull().sum()
Out[9]:
Open                             0
High                             0
Low                              0
Close                            0
Adj Close                        0
Volume                           0
weekday                          0
year                             0
month                            0
day                              0
close_price_lag1                 1
close_price_lag2                 2
close_price_lag3                 3
close_price_lag4                 4
open_price_lag1                  1
open_price_lag2                  2
open_price_lag3                  3
open_price_lag4                  4
high_price_lag1                  1
low_price_lag1                   1
close_price_error_percent        1
open_price_error_percent         1
high_price_error_percent         1
low_price_error_percent          1
adj_close_price_error_percent    1
dtype: int64
In [11]:
df.dropna(inplace=True) # dropped rows with the null values
In [12]:
df.isnull().sum()
Out[12]:
Open                             0
High                             0
Low                              0
Close                            0
Adj Close                        0
Volume                           0
weekday                          0
year                             0
month                            0
day                              0
close_price_lag1                 0
close_price_lag2                 0
close_price_lag3                 0
close_price_lag4                 0
open_price_lag1                  0
open_price_lag2                  0
open_price_lag3                  0
open_price_lag4                  0
high_price_lag1                  0
low_price_lag1                   0
close_price_error_percent        0
open_price_error_percent         0
high_price_error_percent         0
low_price_error_percent          0
adj_close_price_error_percent    0
dtype: int64
In [13]:
print(df.shape)
df.head()
(3107, 25)
Out[13]:
Open High Low Close Adj Close Volume weekday year month day ... open_price_lag2 open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent
Date
2010-01-08 7.510714 7.571429 7.466429 7.570714 6.473097 447610800 0 2010 1 8 ... 7.656429 7.664286 7.622500 7.571429 7.466071 0.006648 -0.006848 0.000000 0.000048 0.006648
2010-01-11 7.600000 7.607143 7.444643 7.503929 6.415994 462229600 0 2010 1 11 ... 7.562500 7.656429 7.664286 7.571429 7.466429 -0.008821 0.011888 0.004717 -0.002918 -0.008822
2010-01-12 7.471071 7.491786 7.372143 7.418571 6.343012 594459600 0 2010 1 12 ... 7.510714 7.562500 7.656429 7.607143 7.444643 -0.011375 -0.016964 -0.015164 -0.009739 -0.011375
2010-01-13 7.423929 7.533214 7.289286 7.523214 6.432484 605892000 0 2010 1 13 ... 7.600000 7.510714 7.562500 7.491786 7.372143 0.014106 -0.006310 0.005530 -0.011239 0.014106
2010-01-14 7.503929 7.516429 7.465000 7.479643 6.395229 432894000 0 2010 1 14 ... 7.471071 7.600000 7.510714 7.533214 7.289286 -0.005792 0.010776 -0.002228 0.024106 -0.005792

5 rows × 25 columns

In [10]:
df['Close'].plot(figsize=(15,5))
Out[10]:
<AxesSubplot:xlabel='Date'>
In [11]:
g = sns.catplot(x='month', y='Close', hue='year', data=df, kind='point', height=10, aspect=2, legend=True) # factorplot was renamed catplot; avoid rebinding plt
g.tight_layout()
In [12]:
df = df.dropna()
df_sorted = df.sort_values(by='Date', ascending=False)
In [13]:
df_sorted.head()
df = df_sorted
df.head()
Out[13]:
Open High Low Close Adj Close Volume weekday year month day ... open_price_lag2 open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent
Date
2022-05-11 153.500000 155.449997 145.809998 146.500000 146.500000 142689800 0 2022 5 11 ... 154.929993 156.009995 163.850006 156.740005 152.929993 -0.051841 -0.012989 -0.008230 -0.046557 -0.051841
2022-05-10 155.520004 156.740005 152.929993 154.509995 154.509995 115366700 0 2022 5 10 ... 156.009995 163.850006 159.669998 155.830002 151.490005 0.016112 0.003808 0.005840 0.009505 0.016112
2022-05-09 154.929993 155.830002 151.490005 152.059998 152.059998 131577900 0 2022 5 9 ... 163.850006 159.669998 158.149994 159.440002 154.179993 -0.033189 -0.006923 -0.022642 -0.017447 -0.033189
2022-05-06 156.009995 159.440002 154.179993 157.279999 157.279999 116055700 0 2022 5 6 ... 159.669998 158.149994 156.710007 164.080002 154.949997 0.003253 -0.047849 -0.028279 -0.004969 0.004727
2022-05-05 163.850006 164.080002 154.949997 156.770004 156.540009 130525300 0 2022 5 5 ... 158.149994 156.710007 161.839996 166.479996 159.259995 -0.055716 0.026179 -0.014416 -0.027063 -0.055716

5 rows × 25 columns

In [14]:
# move the target column to the end
column_to_reorder = df.pop('Close')
df.insert(len(df.columns), 'Close', column_to_reorder)
df.head()
Out[14]:
Open High Low Adj Close Volume weekday year month day close_price_lag1 ... open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent Close
Date
2022-05-11 153.500000 155.449997 145.809998 146.500000 142689800 0 2022 5 11 154.509995 ... 156.009995 163.850006 156.740005 152.929993 -0.051841 -0.012989 -0.008230 -0.046557 -0.051841 146.500000
2022-05-10 155.520004 156.740005 152.929993 154.509995 115366700 0 2022 5 10 152.059998 ... 163.850006 159.669998 155.830002 151.490005 0.016112 0.003808 0.005840 0.009505 0.016112 154.509995
2022-05-09 154.929993 155.830002 151.490005 152.059998 131577900 0 2022 5 9 157.279999 ... 159.669998 158.149994 159.440002 154.179993 -0.033189 -0.006923 -0.022642 -0.017447 -0.033189 152.059998
2022-05-06 156.009995 159.440002 154.179993 157.279999 116055700 0 2022 5 6 156.770004 ... 158.149994 156.710007 164.080002 154.949997 0.003253 -0.047849 -0.028279 -0.004969 0.004727 157.279999
2022-05-05 163.850006 164.080002 154.949997 156.540009 130525300 0 2022 5 5 166.020004 ... 156.710007 161.839996 166.479996 159.259995 -0.055716 0.026179 -0.014416 -0.027063 -0.055716 156.770004

5 rows × 25 columns

Modeling

LSTM Neural Networks

LSTM (Long Short-Term Memory) is a recurrent neural network architecture designed for time-series data. LSTM models are powerful because their gated cell state lets them retain information over long stretches of a sequence.
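
For intuition, an LSTM layer consumes batches of shape (samples, time steps, features). A minimal sketch with toy shapes matching the 32-day, 5-feature windows built below (the layer sizes here are illustrative only):

import numpy as np
import tensorflow as tf

toy = np.random.rand(4, 32, 5).astype("float32")          # 4 toy windows: 32 days x 5 features
lstm = tf.keras.layers.LSTM(8)                             # returns only the final hidden state
print(lstm(toy).shape)                                     # (4, 8)
lstm_seq = tf.keras.layers.LSTM(8, return_sequences=True)  # one output per time step
print(lstm_seq(toy).shape)                                 # (4, 32, 8)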

In [15]:
from sklearn.preprocessing import MinMaxScaler # scale the dataset
import pickle # save and retrieve arbitrary Python objects
from tqdm.notebook import trange # graphical progress bar to track preprocessing progress (tnrange is the deprecated alias)
import plotly.graph_objects as go
import tensorflow as tf
In [16]:
df.head()
Out[16]:
Open High Low Adj Close Volume weekday year month day close_price_lag1 ... open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent Close
Date
2022-05-11 153.500000 155.449997 145.809998 146.500000 142689800 0 2022 5 11 154.509995 ... 156.009995 163.850006 156.740005 152.929993 -0.051841 -0.012989 -0.008230 -0.046557 -0.051841 146.500000
2022-05-10 155.520004 156.740005 152.929993 154.509995 115366700 0 2022 5 10 152.059998 ... 163.850006 159.669998 155.830002 151.490005 0.016112 0.003808 0.005840 0.009505 0.016112 154.509995
2022-05-09 154.929993 155.830002 151.490005 152.059998 131577900 0 2022 5 9 157.279999 ... 159.669998 158.149994 159.440002 154.179993 -0.033189 -0.006923 -0.022642 -0.017447 -0.033189 152.059998
2022-05-06 156.009995 159.440002 154.179993 157.279999 116055700 0 2022 5 6 156.770004 ... 158.149994 156.710007 164.080002 154.949997 0.003253 -0.047849 -0.028279 -0.004969 0.004727 157.279999
2022-05-05 163.850006 164.080002 154.949997 156.540009 130525300 0 2022 5 5 166.020004 ... 156.710007 161.839996 166.479996 159.259995 -0.055716 0.026179 -0.014416 -0.027063 -0.055716 156.770004

5 rows × 25 columns

In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3107 entries, 2022-05-11 to 2010-01-08
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Open                           3107 non-null   float64
 1   High                           3107 non-null   float64
 2   Low                            3107 non-null   float64
 3   Adj Close                      3107 non-null   float64
 4   Volume                         3107 non-null   int64  
 5   weekday                        3107 non-null   int32  
 6   year                           3107 non-null   int64  
 7   month                          3107 non-null   int64  
 8   day                            3107 non-null   int64  
 9   close_price_lag1               3107 non-null   float64
 10  close_price_lag2               3107 non-null   float64
 11  close_price_lag3               3107 non-null   float64
 12  close_price_lag4               3107 non-null   float64
 13  open_price_lag1                3107 non-null   float64
 14  open_price_lag2                3107 non-null   float64
 15  open_price_lag3                3107 non-null   float64
 16  open_price_lag4                3107 non-null   float64
 17  high_price_lag1                3107 non-null   float64
 18  low_price_lag1                 3107 non-null   float64
 19  close_price_error_percent      3107 non-null   float64
 20  open_price_error_percent       3107 non-null   float64
 21  high_price_error_percent       3107 non-null   float64
 22  low_price_error_percent        3107 non-null   float64
 23  adj_close_price_error_percent  3107 non-null   float64
 24  Close                          3107 non-null   float64
dtypes: float64(20), int32(1), int64(4)
memory usage: 619.0 KB
In [18]:
# keep only the columns the model will use
# Open, High, Low and Close are highly correlated, but we keep all four plus Volume as inputs
data = df[['Open','High','Low','Close', 'Volume']]
data.head(10)
Out[18]:
Open High Low Close Volume
Date
2022-05-11 153.500000 155.449997 145.809998 146.500000 142689800
2022-05-10 155.520004 156.740005 152.929993 154.509995 115366700
2022-05-09 154.929993 155.830002 151.490005 152.059998 131577900
2022-05-06 156.009995 159.440002 154.179993 157.279999 116055700
2022-05-05 163.850006 164.080002 154.949997 156.770004 130525300
2022-05-04 159.669998 166.479996 159.259995 166.020004 108256500
2022-05-03 158.149994 160.710007 156.320007 159.479996 88966500
2022-05-02 156.710007 158.229996 153.270004 157.960007 123055300
2022-04-29 161.839996 166.199997 157.250000 157.649994 131587100
2022-04-28 159.250000 164.520004 158.929993 163.639999 130216800
In [22]:
%matplotlib inline
plt.figure(figsize=(16,6))
plt.title('Close Price History')
plt.plot(df['Close'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=15)
plt.show()
In [23]:
# set up the test set size
# the test set covers 2018-01-01 onwards (2017-12-31 is a Sunday, so it has no trading row)
test_length = data[(data.index >= '2017-12-31')].shape[0]
test_length
Out[23]:
1098
In [24]:
def Features_and_Targets(data, feature_length):
  # build sliding windows: each sample holds `feature_length` consecutive rows,
  # and the target is the Close price of the row that follows the window

  X = []
  y = []

  for i in trange(len(data) - feature_length):
    X.append(data.iloc[i:i+feature_length, :].values)
    y.append(data['Close'].values[i+feature_length])

  X = np.array(X)
  y = np.array(y)

  return X, y
In [25]:
X, y = Features_and_Targets(data, 32)
In [26]:
X.shape, y.shape # record, days, values
Out[26]:
((3075, 32, 5), (3075,))
In [27]:
# split into train and test set
X_train, X_test, y_train, y_test = X[:-test_length], X[-test_length:], y[:-test_length], y[-test_length:]
In [28]:
# check training dataset
X_train.shape, y_train.shape
Out[28]:
((1977, 32, 5), (1977,))
In [29]:
# check testing dataset
X_test.shape, y_test.shape
Out[29]:
((1098, 32, 5), (1098,))
In [30]:
# create a scaler for arrays with more than two dimensions
# sklearn scalers only operate on 2-D arrays, but our feature windows are 3-D
class MultiDimensionScaler():

  # initialize an empty list of scalers
  def __init__(self):
    self.scalers = []

  # loop over the third dimension (one feature per slice); at each step fit a
  # fresh scaler on that slice and keep it for reuse on the test set
  def fit_transform(self, X):
    total_dims = X.shape[2]
    for i in range(total_dims):
        Scaler = MinMaxScaler()
        X[:, :, i] = Scaler.fit_transform(X[:, :, i])
        self.scalers.append(Scaler)
    return X # returns the transformed data; self.scalers collects the fitted scalers

  # loop over the third dimension again, applying the already-fitted scalers to new data
  def transform(self, X):
    for i in range(X.shape[2]):
      X[:, :, i] = self.scalers[i].transform(X[:, :, i])
    return X
In [31]:
# apply the above on our features
Feature_Scaler = MultiDimensionScaler()
X_train = Feature_Scaler.fit_transform(X_train)
X_test = Feature_Scaler.transform(X_test)
In [32]:
# apply the MinMaxScaler on targets
Target_Scaler = MinMaxScaler()
y_train = Target_Scaler.fit_transform(y_train.reshape(-1,1))
y_test = Target_Scaler.transform(y_test.reshape(-1,1))
In [33]:
# two helper functions to save and load Python objects
def save_object(obj, name: str):
  with open(f"{name}.pck", "wb") as pickle_out:
    pickle.dump(obj, pickle_out)

def load_obj(name: str):
  with open(f"{name}.pck", "rb") as pickle_in:
    return pickle.load(pickle_in)
In [34]:
# save the objects for future use
save_object(Feature_Scaler, "Feature_Scaler")
save_object(Target_Scaler, "Target_Scaler")
Model Building
In [35]:
# define callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

save_best = ModelCheckpoint("best_weights.h5", monitor='val_loss', 
                            save_best_only=True, save_weights_only=True)

reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.25, 
                              patience=5, min_lr=0.00001, verbose=1)
In [36]:
#creating model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional

model = Sequential()

model.add(Bidirectional(LSTM(600, return_sequences=True, recurrent_dropout=0.1, input_shape=(32, 5)))) # return_sequences=True so the next LSTM layer receives the full sequence; input is 32 days x 5 features
model.add(LSTM(256, recurrent_dropout=0.1))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu')) # standard ReLU activation
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))
In [37]:
# compile the model
optimizer = tf.keras.optimizers.SGD(learning_rate=0.002)
# with batch_size=1 the weights update after every sample; plain SGD tends to be
# more stable than Adam at such small batch sizes

model.compile(loss='mse', optimizer=optimizer, metrics=['mse', 'mae'])
In [38]:
# Fitting the model
history = model.fit(X_train, y_train,
                    epochs=1, batch_size=1,
                    verbose=1, shuffle=False,
                    validation_data=(X_test, y_test),
                    callbacks=[reduce_lr, save_best])
1977/1977 [==============================] - 3122s 2s/step - loss: 0.0151 - mse: 0.0151 - mae: 0.0569 - val_loss: 0.0033 - val_mse: 0.0033 - val_mae: 0.0525 - lr: 0.0020
In [39]:
# summarize the constructed model
print(model.summary())
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 bidirectional (Bidirectiona  (1, 32, 1200)            2908800   
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (1, 256)                  1491968   
                                                                 
 dropout (Dropout)           (1, 256)                  0         
                                                                 
 dense (Dense)               (1, 64)                   16448     
                                                                 
 dropout_1 (Dropout)         (1, 64)                   0         
                                                                 
 dense_1 (Dense)             (1, 32)                   2080      
                                                                 
 dense_2 (Dense)             (1, 1)                    33        
                                                                 
=================================================================
Total params: 4,419,329
Trainable params: 4,419,329
Non-trainable params: 0
_________________________________________________________________
None
In [40]:
#Prediction
pred = model.predict(X_test)
In [41]:
pred
Out[41]:
array([[ 0.02393947],
       [ 0.02413179],
       [ 0.02426072],
       ...,
       [-0.00356807],
       [-0.00296667],
       [-0.00206649]], dtype=float32)
In [43]:
pred.shape
Out[43]:
(1098, 1)
In [44]:
y_test.shape
Out[44]:
(1098, 1)
In [45]:
Predictions = Target_Scaler.inverse_transform(pred)
Actual = Target_Scaler.inverse_transform(y_test)
In [46]:
Predictions.shape
Out[46]:
(1098, 1)
In [48]:
#Model Performance
# root mean squared error (RMSE), computed on the min-max-scaled targets
rmse = np.sqrt(np.mean((pred - y_test) ** 2))
print("RMSE for LSTM before Regularization:", rmse)
RMSE for LSTM before Regularization: 0.05740772693407133
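Since this RMSE is computed on min-max-scaled targets, a quick sketch using the already inverse-transformed Predictions and Actual arrays expresses the same error in dollars, which is the scale the regularized model is evaluated on later:

rmse_dollars = np.sqrt(np.mean((Predictions - Actual) ** 2))
print("RMSE in price units:", rmse_dollars)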
In [49]:
Predictions = np.squeeze(Predictions, axis=1) # drop the singleton axis so the arrays are 1-D for plotting
Actual = np.squeeze(Actual, axis=1)
In [50]:
# plot Actual vs Predicted Values
%matplotlib inline
fig = go.Figure()
fig.update_layout(
    title="Actual Closing Value vs Predicted Closing Value",
    yaxis_title="Closing Stock Value",
    xaxis_title="Year of Closing Price in Apple ",
    legend_title="Actual vs Prediction",
    font=dict(
        family="Courier New, monospace",
        size=13,
        color="RebeccaPurple"
    )
)

fig.add_trace(go.Scatter(x = data.index[-test_length:], y = Actual, mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x = data.index[-test_length:], y = Predictions, mode='lines', name='Predictions'))
fig.show()
In [51]:
#Visualization of predictions over the whole dataset
Total_features = np.concatenate((X_train, X_test), axis=0)
Total_Targets = np.concatenate((y_train, y_test), axis=0)
In [52]:
Total_features.shape
Out[52]:
(3075, 32, 5)
In [53]:
#predict over all feature windows
pred_wh = model.predict(Total_features)
In [54]:
pred_wh
Out[54]:
array([[ 0.24437135],
       [ 0.24506369],
       [ 0.24540567],
       ...,
       [-0.00356808],
       [-0.00296669],
       [-0.0020665 ]], dtype=float32)
In [55]:
Predictions = Target_Scaler.inverse_transform(pred_wh)
Actual = Target_Scaler.inverse_transform(Total_Targets)
In [56]:
Predictions = np.squeeze(Predictions, axis=1)
Actual = np.squeeze(Actual, axis=1)
In [57]:
Predictions
Out[57]:
array([60.840355, 60.951374, 61.006214, ..., 21.081764, 21.1782  ,
       21.322554], dtype=float32)
In [58]:
# compare actual and predicted Close prices over the full history
%matplotlib inline

fig = go.Figure()
fig.update_layout(
    title="Stock Price of Apple ",
    yaxis_title="Closing Stock Value",
    xaxis_title="Year of Stock Price in Apple",
    legend_title="Actual vs Prediction",
    font=dict(
        family="Courier New, monospace",
        size=13,
        color="RebeccaPurple"
    )
)
fig.add_trace(go.Scatter(x=data.index, y=Actual, mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=data.index, y=Predictions, mode='lines', name='Predictions'))
fig.show()
In [59]:
# save and load the whole model
model.save("LSTM_Yfinance.h5")
loaded_model = tf.keras.models.load_model("LSTM_Yfinance.h5")

Predicting the Stock Price

In [60]:
def PredictStockPrice(Model, DataFrame, PreviousDate, feature_length=32):
  # slice out the `feature_length` rows adjacent to PreviousDate (the DataFrame is
  # sorted newest-first, matching how the training windows were built), scale them
  # with the fitted scalers, predict, and map the result back to price units
  idx_location = DataFrame.index.get_loc(PreviousDate)
  Features = DataFrame.iloc[idx_location - feature_length: idx_location, :].values
  Features = np.expand_dims(Features, axis=0)
  Features = Feature_Scaler.transform(Features)
  Prediction = Model.predict(Features)
  Prediction = Target_Scaler.inverse_transform(Prediction)
  return Prediction[0][0]
In [65]:
PredictStockPrice(loaded_model, data, '2022-03-02')
Out[65]:
58.550423

Regularization

Dropout Regularization in LSTM

Dropout is a regularization method that approximates training a large number of neural networks with different architectures in parallel.
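
A minimal sketch of the mechanism (illustrative rate and shapes; the layer is only active when training=True):

import numpy as np
import tensorflow as tf

x = np.ones((1, 10), dtype="float32")
drop = tf.keras.layers.Dropout(0.5)
print(drop(x, training=True).numpy())   # roughly half the entries zeroed, survivors scaled by 1/(1-0.5) = 2
print(drop(x, training=False).numpy())  # identity at inference time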

In [66]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
In [67]:
#Preparation
# convert a date string from DD.MM.YYYY to YYYY-MM-DD
def to_datetime(df):
    date = datetime.strptime(df, '%d.%m.%Y')
    return date.strftime("%Y-%m-%d")
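For example, to_datetime('11.05.2022') returns '2022-05-11'; the helper is defined here but not used in the cells below.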
In [68]:
df.head()
Out[68]:
Open High Low Adj Close Volume weekday year month day close_price_lag1 ... open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent Close
Date
2022-05-11 153.500000 155.449997 145.809998 146.500000 142689800 0 2022 5 11 154.509995 ... 156.009995 163.850006 156.740005 152.929993 -0.051841 -0.012989 -0.008230 -0.046557 -0.051841 146.500000
2022-05-10 155.520004 156.740005 152.929993 154.509995 115366700 0 2022 5 10 152.059998 ... 163.850006 159.669998 155.830002 151.490005 0.016112 0.003808 0.005840 0.009505 0.016112 154.509995
2022-05-09 154.929993 155.830002 151.490005 152.059998 131577900 0 2022 5 9 157.279999 ... 159.669998 158.149994 159.440002 154.179993 -0.033189 -0.006923 -0.022642 -0.017447 -0.033189 152.059998
2022-05-06 156.009995 159.440002 154.179993 157.279999 116055700 0 2022 5 6 156.770004 ... 158.149994 156.710007 164.080002 154.949997 0.003253 -0.047849 -0.028279 -0.004969 0.004727 157.279999
2022-05-05 163.850006 164.080002 154.949997 156.540009 130525300 0 2022 5 5 166.020004 ... 156.710007 161.839996 166.479996 159.259995 -0.055716 0.026179 -0.014416 -0.027063 -0.055716 156.770004

5 rows × 25 columns

In [69]:
df.reset_index(inplace=True)
df.head()
Out[69]:
Date Open High Low Adj Close Volume weekday year month day ... open_price_lag3 open_price_lag4 high_price_lag1 low_price_lag1 close_price_error_percent open_price_error_percent high_price_error_percent low_price_error_percent adj_close_price_error_percent Close
0 2022-05-11 153.500000 155.449997 145.809998 146.500000 142689800 0 2022 5 11 ... 156.009995 163.850006 156.740005 152.929993 -0.051841 -0.012989 -0.008230 -0.046557 -0.051841 146.500000
1 2022-05-10 155.520004 156.740005 152.929993 154.509995 115366700 0 2022 5 10 ... 163.850006 159.669998 155.830002 151.490005 0.016112 0.003808 0.005840 0.009505 0.016112 154.509995
2 2022-05-09 154.929993 155.830002 151.490005 152.059998 131577900 0 2022 5 9 ... 159.669998 158.149994 159.440002 154.179993 -0.033189 -0.006923 -0.022642 -0.017447 -0.033189 152.059998
3 2022-05-06 156.009995 159.440002 154.179993 157.279999 116055700 0 2022 5 6 ... 158.149994 156.710007 164.080002 154.949997 0.003253 -0.047849 -0.028279 -0.004969 0.004727 157.279999
4 2022-05-05 163.850006 164.080002 154.949997 156.540009 130525300 0 2022 5 5 ... 156.710007 161.839996 166.479996 159.259995 -0.055716 0.026179 -0.014416 -0.027063 -0.055716 156.770004

5 rows × 26 columns

In [70]:
df.shape
Out[70]:
(3107, 26)
In [71]:
# train/test split at a fixed row index (not by year)
num_shape = 1900

# note: after reset_index, column index 1 is 'Open', so this single series is what gets modeled below
train = df.iloc[:num_shape, 1:2].values
test = df.iloc[num_shape:, 1:2].values
In [72]:
sc = MinMaxScaler(feature_range = (0, 1))
train_scaled = sc.fit_transform(train)
In [73]:
# slide a 60-step window over the scaled series
X_train = []

# the price on the next day is the target
y_train = []

window = 60

for i in range(window, num_shape):
    X_train_ = np.reshape(train_scaled[i-window:i, 0], (window, 1))
    X_train.append(X_train_)
    y_train.append(train_scaled[i, 0])
X_train = np.stack(X_train)
y_train = np.stack(y_train)
In [74]:
#LSTM model using Dropout regularization
# initializing the recurrent neural network
model = Sequential()
# adding the first LSTM layer (default tanh activation) with Dropout regularization
# units = dimensionality of the output space

model.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
model.add(Dropout(0.5))

model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.5))

model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.5))

model.add(LSTM(units = 50))
model.add(Dropout(0.5))

# Adding the output layer
model.add(Dense(units = 1))
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_2 (LSTM)               (None, 60, 50)            10400     
                                                                 
 dropout_2 (Dropout)         (None, 60, 50)            0         
                                                                 
 lstm_3 (LSTM)               (None, 60, 50)            20200     
                                                                 
 dropout_3 (Dropout)         (None, 60, 50)            0         
                                                                 
 lstm_4 (LSTM)               (None, 60, 50)            20200     
                                                                 
 dropout_4 (Dropout)         (None, 60, 50)            0         
                                                                 
 lstm_5 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_5 (Dropout)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
=================================================================
Total params: 71,051
Trainable params: 71,051
Non-trainable params: 0
_________________________________________________________________
In [79]:
model.compile(optimizer = 'adam', loss = 'mean_squared_error',metrics=["mse","mae"])
In [80]:
model.fit(X_train, y_train, epochs = 10,batch_size = 32);
Epoch 1/10
58/58 [==============================] - 12s 87ms/step - loss: 0.0063 - mse: 0.0063 - mae: 0.0461
Epoch 2/10
58/58 [==============================] - 5s 93ms/step - loss: 0.0061 - mse: 0.0061 - mae: 0.0476
Epoch 3/10
58/58 [==============================] - 5s 93ms/step - loss: 0.0046 - mse: 0.0046 - mae: 0.0405
Epoch 4/10
58/58 [==============================] - 5s 90ms/step - loss: 0.0043 - mse: 0.0043 - mae: 0.0394
Epoch 5/10
58/58 [==============================] - 5s 93ms/step - loss: 0.0044 - mse: 0.0044 - mae: 0.0408
Epoch 6/10
58/58 [==============================] - 6s 96ms/step - loss: 0.0038 - mse: 0.0038 - mae: 0.0392
Epoch 7/10
58/58 [==============================] - 5s 92ms/step - loss: 0.0032 - mse: 0.0032 - mae: 0.0351
Epoch 8/10
58/58 [==============================] - 6s 98ms/step - loss: 0.0035 - mse: 0.0035 - mae: 0.0360
Epoch 9/10
58/58 [==============================] - 8s 142ms/step - loss: 0.0035 - mse: 0.0035 - mae: 0.0365
Epoch 10/10
58/58 [==============================] - 6s 102ms/step - loss: 0.0041 - mse: 0.0041 - mae: 0.0393
In [81]:
#Prediction
df_volume = np.vstack((train, test))

inputs = df_volume[df_volume.shape[0] - test.shape[0] - window:]
inputs = inputs.reshape(-1,1)
inputs = sc.transform(inputs)

num_2 = df_volume.shape[0] - num_shape + window

X_test = []

for i in range(window, num_2):
    X_test_ = np.reshape(inputs[i-window:i, 0], (window, 1))
    X_test.append(X_test_)
    
X_test = np.stack(X_test)
In [82]:
predict = model.predict(X_test)
predict = sc.inverse_transform(predict)
predict
Out[82]:
array([[30.334484 ],
       [30.226282 ],
       [30.116087 ],
       ...,
       [11.862063 ],
       [11.8908205],
       [11.917963 ]], dtype=float32)
In [85]:
diff = predict - test
print("MSE:", np.mean(diff**2))
print("MAE:", np.mean(abs(diff)))
print("RMSE after Regulaization:", np.sqrt(np.mean(diff**2)))
MSE: 16.629977259070234
MAE: 3.984422708209527
RMSE after Regularization: 4.077986912567306
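Note that this RMSE is measured in dollars, whereas the 0.0574 reported before regularization was computed on min-max-scaled targets; the two numbers are therefore not directly comparable without rescaling.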
In [86]:
print(predict.shape)
print(df_volume.shape)
(1207, 1)
(3107, 1)
In [89]:
#visualizing the Apple stock close price prediction
%matplotlib inline
plt.figure(figsize=(20,9))
plt.plot(df['Date'].values[200:], df_volume[200:], color = 'red', label = 'Real Apple Stock Price')
plt.plot(df['Date'][-predict.shape[0]:].values, predict, color = 'blue', label = 'Predicted Apple Stock Price')
plt.title('Apple Stock Price Prediction')
plt.xlabel('Date')
plt.ylabel('Closing Price ($)')
plt.legend()
plt.show()
In [90]:
# end of the modeling workflow